#############################
# Simulation Plots #####
#############################
# created by Thomas Däubler
# thomas.daubler@ucd.ie

# set working directory to replication folder
#setwd()
# if (Sys.getenv("USERNAME")=="Thomas") {
#   setwd("C:/Users/Thomas/Dropbox/Env_Voting/code/replication")
# } else setwd("C:/Users/quossfa/Dropbox/Env_Voting/code/replication")

# R version 4.3.3
library(readstata13) # 0.10.1
library(dplyr) # 1.1.4
library(stringr) # 1.5.1
library(ggplot2) # 3.5.0
library(sampling) # 2.10
library(MASS) # 7.3-60.0.1
library(forcats)
# code below also calls (not loaded to avoid function conflicts:
# plyr # 1.8.9

# Figures 3 and 4 ####
# load and inspect

d <- read.dta13("w4_final_recoded_local.dta")
class(d$displayplace) # 1-3
class(d$agematch2) # 0-2
d$displayplace <- as.factor(d$displayplace)
d$agematch2 <- as.factor(d$agematch2)
# Stata drops obs. because of missings in the policy distance variables
d <- d[!is.na(d$lr_diff) & !is.na(d$env_score_diff),]
table(d$w4_treat6, d$w4_treat6_cor)
# equal to Table A.3 Ns: 28512 26694 27828 25164

# briefly inspect regr results
myd1 <- read.dta13("rologit1.dta")
myd1[,1]
myd1[17:18,1] # lrdiff and envdiff


# Simulate the coefficients 

set.seed(67738288)
# set number of draws of coef
mynsim <- 1000 
# simulate coefficients based on exported regr. results
# x is 1,2,3,4 for split models by treatment group
mydrawer <- function(x, add="", nsim = mynsim){
  myd <- read.dta13(paste0("rologit",add, x,".dta"))
  coefs <- myd[,1]
  coefs <- coefs[coefs != 0]
  vcov <- myd[,2:ncol(myd)] 
  vcov <- vcov[vcov[,1] != 0, ]
  vcov <- vcov[, vcov[1,] != 0]
  bs <- t(mvrnorm(n=nsim,coefs,vcov)) 
}

bsro <- plyr::llply(1:4,mydrawer) # 4-list of coef X nsim

# diff of coef across treatment groups 

# indices: env_diff 15, lr_diff 14, gender 3, age5y 4, age10y 5, residence 2
# (cp. reg formula below)
coefind <- c(15,14,3,4,5,2)
rowMeans(bsro[[1]])[coefind]

# compare diff of coefs c in model mtrt to ctrl group
mycomp <- function(mtrt, c = coefind){
  x0 <- bsro[[1]][c,]
  x1 <- bsro[[mtrt]][c,]
  out <- data.frame(mean = rowMeans(x1-x0),
                    lo = apply(x1-x0, 1, quantile, probs = .025),
                    hi = apply(x1-x0, 1, quantile, probs = .975)
  )
  rownames(out) <- c("env","lr","gender","age5","age10" , "resid")
  return(out)
}

mycomp(2)
mycomp(3)
mycomp(4)


# Functions for calculating stuff of interest 

# modelno is 1 to 4 (treatments)
# cind is 1 to 18 (alternative/candidate index)

# compare match==1 to match == 0, rest is as-observed
myfunmatch <- function(cind = 1, 
                       targetmatch = "gendermatch",
                       data = d, modelno = 1, bs = bsro, pidonly = FALSE){
  data <- filter(data, w4_treat6_cor == modelno) 
  b <- bs[[modelno]]
  stopifnot(targetmatch %in% c("gendermatch","agematch","localitymatch"))
  if(targetmatch == "agematch") targetmatch <- "agematch21" # <= 5 years
  m <- model.matrix(~ partyidmatch + localitymatch + gendermatch +  agematch2 + 
                      displayplace + w4_listplace_cand_cat + 
                      lr_diff + env_score_diff +  w4_partyid_candv2,
                    data = data)[,-1] # (remove intercept)
  if(pidonly == TRUE) csel <- data$choice == cind & data$partyidmatch == 1
  else csel <- data$choice == cind
  # Scenario 1: match 0, distance as observed
  ma <- m
  ma[csel, targetmatch] <- 0
  if(targetmatch == "agematch") ma[csel, "agematch2Agediff. 5 to 10 yrs"] <- 0
  pp1 <- exp(ma %*% b) # e^xb, with dim obs(long format)*nsim
  # for each col/sim, sum the e^xb by respondent for the denominator of the predicted prob:
  pps1 <- t(plyr::adply(pp1, 2, .fun=ave, data$pubid,FUN=sum)[,-1])
  # e^xb / sum(e^xb):
  p1 <- pp1[csel,]/pps1[csel,]
  # Scenario 2: match 1, distance as obs
  ma <- m
  ma[csel, targetmatch] <- 1
  if(targetmatch == "agematch") ma[csel, "agematch2Agediff. 5 to 10 yrs"] <- 0
  pp2 <- exp(ma %*% b)
  pps2 <- t(plyr::adply(pp2, 2, .fun=ave, data$pubid,FUN=sum)[,-1])
  p2 <- pp2[csel,]/pps2[csel,]
  out <- data.frame(cind = cind,
                    # m0 = colMeans(p1),
                    # m1 = colMeans(p2),
                    diff = colMeans(p2-p1)) 
  return(out)
}

# compare as-observed-distance-increased-by-1 to as-observed
myfundist <- function(cind = 1, 
                      targetdist = "env_score_diff",
                      data = d, modelno = 1, bs = bsro, pidonly = FALSE){
  data <- filter(data, w4_treat6_cor == modelno) 
  b <- bs[[modelno]]
  m <- model.matrix(~ partyidmatch + localitymatch + gendermatch +  agematch2 + 
                      displayplace + w4_listplace_cand_cat + 
                      lr_diff + env_score_diff +  w4_partyid_candv2,
                    data = data)[,-1] # (remove intercept)
  if(pidonly == TRUE) csel <- data$choice == cind & data$partyidmatch == 1
  else csel <- data$choice == cind
  # Scenario O(bserved) - actually not needed, since mean of that is always 1/18
  # Scenario 4: match as observed, distance + 1
  ma <- m
  ma[csel, targetdist] <- 1 + ma[csel, targetdist]
  pp4 <- exp(ma %*% b)
  pps4 <- t(plyr::adply(pp4, 2, .fun=ave, data$pubid,FUN=sum)[,-1])
  p4 <- pp4[csel,]/pps4[csel,]
  # difference  bw match and non-match for distance same; difference... for distance plus 1:
  # (has nsim rows, with the mean diff across cand with a certain cind in each draw)
  out <- data.frame(cind = cind,
                    #pp = colMeans(p4),
                    diff = colMeans(p4)-1/18
  ) 
  return(out)
}


mysum <- function(x){
  out <- data.frame(
    md = mean(x$diff)
  )
}

mysum2 <- function(x){
  out <- data.frame(
    lo_diff = quantile(x$md, .025),
    mean_diff = mean(x$md),
    hi_diff = quantile(x$md, .975)
  )
  return(out)
}

# this one calculates the diff bw diff in treated group to diff in ctrl group
mysum3 <- function(x){
  x <- reshape(x, idvar = "simind", timevar = "modelno",
               v.names = "md", direction = "wide")
  out <- data.frame(
    lo_difftoc.2 = quantile(x$md.2-x$md.1, .025),
    mean_difftoc.2 = mean(x$md.2-x$md.1),
    hi_difftoc.2 = quantile(x$md.2-x$md.1, .975),
    lo_difftoc.3 = quantile(x$md.3-x$md.1, .025),
    mean_difftoc.3 = mean(x$md.3-x$md.1),
    hi_difftoc.3 = quantile(x$md.3-x$md.1, .975),
    lo_difftoc.4 = quantile(x$md.4-x$md.1, .025),
    mean_difftoc.4 = mean(x$md.4-x$md.1),
    hi_difftoc.4 = quantile(x$md.4-x$md.1, .975)
  )
  return(out)
}


# Calculate quantities of interest 

# dataset of permutations of parameter specs
myparsdist <- tidyr::expand_grid(cind = 1:18,
                                 modelno = 1:4,
                                 targetdist = c("env_score_diff","lr_diff"),
                                 pidonly = FALSE)
system.time(mysimdist <- plyr::mdply(myparsdist, myfundist, data = d) )
# nrows is (myparsdist * mynsim)

#save(mysimdist, file ="mysimdist.RData")
#load(file ="C:/Users/quossfa/Dropbox/Env_Voting/code/replication-harvard-dataverse/mysimdist.RData")

# number the draws:
mysimdist$simind <- ave(mysimdist$diff, mysimdist$cind, mysimdist$modelno, mysimdist$targetdist,
                        FUN = seq_along) 
# take the mean across candidates (across candidate indices) in each draw
# this will have an Nobs = nsim* 2 (dim) * 4 (groups):
mysimdistci0 <- plyr::ddply(mysimdist, c("modelno","targetdist", "simind"),
                            mysum )

# take mean and quantiles across draws:
mysimdistci <- plyr::ddply(mysimdistci0, c("modelno","targetdist"),
                           mysum2)
# the apply is over cind * nsim observations (which are means across the cands within cind)

# diff of the diff in the treatment group and the diff in the ctrl group
tmp <- plyr::ddply(mysimdistci0, c("targetdist"),
                   mysum3)
tmp <- reshape(tmp, idvar = "targetdist", timevar = "modelno",
               varying = 2:ncol(tmp), direction = "long")
mysimdistci <- merge(mysimdistci, tmp, all = TRUE)
rm(mysimdistci0)

## now the same procedure for matches
myparsmatch <- tidyr::expand_grid(cind = 1:18, modelno = 1:4,
                                  targetmatch = c("gendermatch","agematch", "localitymatch"))
system.time(mysimmatch <- plyr::mdply(myparsmatch, myfunmatch, data = d)) 

#save(mysimmatch, file ="mysimmatch.RData")
#load(file ="C:/Users/quossfa/Dropbox/Env_Voting/code/replication-harvard-dataverse/mysimmatch.RData")

mysimmatch$simind <- ave(mysimmatch$diff, 
                         mysimmatch$cind, mysimmatch$modelno, mysimmatch$targetmatch,
                         FUN = seq_along) 
# take the mean across candidates (across candidate indices) in each draw
mysimmatchci0 <- plyr::ddply(mysimmatch, c("modelno","targetmatch", "simind"),
                             mysum )
# take mean and quantiles across draws
mysimmatchci <- plyr::ddply(mysimmatchci0, c("modelno","targetmatch"),
                            mysum2)
# the apply is over cind * nsim observations (which are means across the cands witin cind)

# diff of the diff in the treatment group to the diff in the ctrl group
tmp <- plyr::ddply(mysimmatchci0, c("targetmatch"),
                   mysum3)
tmp <- reshape(tmp, idvar = "targetmatch", timevar = "modelno",
               varying = 2:ncol(tmp), direction = "long")
mysimmatchci <- merge(mysimmatchci, tmp, all = TRUE)


# Graphs 

dgr1 <- mysimdistci
dgr1 <- reshape(dgr1, idvar = c("modelno","targetdist"),
                timevar = "var", direction = "long",
                varying = 3:8, sep = "_")
dgr1$group <- ifelse(dgr1$modelno == 1,  "ctrl", "info")
dgr1$modelno <- factor(dgr1$modelno)
levels(dgr1$modelno) <- c("Ctrl","Inf:Env.","Inf:L-R","Inf:*2")
dgr1$targetdist <- factor(ifelse(dgr1$targetdist == "lr_diff", "Dimension: L-R", "Dimension: Envir."))
dgr1$targetdist <- factor(dgr1$targetdist,levels(dgr1$targetdist)[c(2,1)])
dgr1$var <- factor(dgr1$var)
levels(dgr1$var) <- c("Difference to observed","Difference to ctrl-group diff.")

ggplot(dgr1, 
       aes(x = modelno, y = mean, 
           ymin = lo, ymax = hi)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_pointrange(position=position_dodge2(width=.35), colour = "blue") +
  facet_grid(var ~ targetdist ) +
  labs(x = "Group", y = "Difference in mean of Pr(chosen first)") +
  theme_bw()
ggsave("Figures/Figure3.eps", 
       width = 16, height = 12 ,units = "cm")

dgr2 <- mysimmatchci
dgr2 <- reshape(dgr2, idvar = c("modelno","targetmatch"),
                timevar = "var", direction = "long",
                varying = 3:8, sep = "_")
dgr2$group <- ifelse(dgr2$modelno == 1,  "ctrl", "info")
dgr2$modelno <- factor(dgr2$modelno)
levels(dgr2$modelno) <- c("Ctrl","Inf:Env.","Inf:L-R","Inf:*2")
dgr2$targetmatch <- factor(dgr2$targetmatch)
levels(dgr2$targetmatch) <- c("Age","Gender","Residence")
dgr2$targetmatch <- factor(dgr2$targetmatch, levels = levels(dgr2$targetmatch)[c(2,1,3)])
dgr2$var <- factor(dgr2$var)
levels(dgr2$var) <- c("Difference to non-match","Difference to ctrl-group diff.")

ggplot(dgr2, 
       aes(x = modelno, y = mean, 
           ymin = lo, ymax = hi)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_pointrange(position=position_dodge2(width=.35), colour = "blue") +
  facet_grid(var ~ targetmatch ) +
  labs(x = "Group", y = "Difference in mean Pr(chosen first)") +
  theme_bw()

ggsave("Figures/Figure4.eps",
       width = 16, height = 12 ,units = "cm")

#save.image("do_not_upload/mysimWorkspace.RData")

# Figures A.6 and A.7 ####

# for cand-party
myorcp <- function(myd, myind = c(20,18, 19,17),
                   diff = TRUE, or = TRUE){ # first envir, then lr coef
  coefs <- myd[myind,1]
  vcov <- myd[myind, 1+myind] # 4x4
  # b1 = pty coef, b2 = cand distance coef
  # then b1-b2 effect of distance to party, b2 = effect of distance to cand
  d <- data.frame(dim=c(rep("envir",2), rep("l-r",2)),
                  estimate= rep(c("party","cand"),2))
  if(diff == FALSE){
    d$mean <- coefs
    d$var <- diag(as.matrix(vcov))
  } else if(diff == TRUE){
    d$mean <- c(coefs[1]-coefs[2],coefs[2],
                coefs[3]-coefs[4],coefs[4])
    d$var <- c(vcov[1,1] + vcov[2,2] - 2*vcov[1,2], vcov[2,2],
               vcov[3,3] + vcov[4,4] - 2*vcov[3,4], vcov[4,4])
  }
  
  d$lo <- d$mean - 1.96*sqrt(d$var)
  d$hi <- d$mean + 1.96*sqrt(d$var)
  if(or == TRUE){
    d$mean <- exp(d$mean)
    d$lo <- exp(d$lo)
    d$hi <- exp(d$hi)
  } 
  d <- d[,-4] # drop variance
  return(d)
}

d1 <- read.dta13("separatepartyfourgroup1.dta")
d2 <- read.dta13("separatepartyfourgroup2.dta")
d3 <- read.dta13("separatepartyfourgroup3.dta")
d4 <- read.dta13("separatepartyfourgroup4.dta")

d1[c(20,18, 19,17),1] # envir pty, env cand dist change, l-r pty, l-r cand distch
d2[c(20,18, 19,17),1]
d3[c(20,18, 19,17),1]
d4[c(20,18, 19,17),1]

# (myor(d1))

dcp <- plyr::ldply(list(d1,d2,d3,d4), myorcp, diff = FALSE, or = FALSE)
dcp$group <- rep(c("ctrl","envir","l-r","envir + l-r"), each =4)

dcpdiff <- plyr::ldply(list(d1,d2,d3,d4), myorcp, diff = TRUE, or = FALSE)
dcpdiff$group <- rep(c("ctrl","envir","l-r","envir + l-r"), each =4)

#save(dcp, dcpdiff, file="Oddsratios_v4.RData")

dcp <- dcp %>%
  plyr::mutate(group = fct_relevel(as.factor(group), "ctrl", "envir", "l-r", "envir + l-r"),
               estimate = fct_relevel(as.factor(estimate), "party", "cand"),
               dim = fct_relevel(as.factor(dim),  "l-r", "envir")
  )

dcpdiff <- dcpdiff %>%
  plyr::mutate(group = fct_relevel(as.factor(group), "ctrl", "envir", "l-r", "envir + l-r"),
               estimate = fct_relevel(as.factor(estimate), "party", "cand"),
               dim = fct_relevel(as.factor(dim),  "l-r","envir"))

dim.labs <- c("Dimension: Envir.", "Dimension: L-R")
names(dim.labs) <- c("envir", "l-r")

# Figure A.6
ggplot(dcp, aes(group, mean, group=estimate, colour=estimate)) +
  geom_point(position=position_dodge(width=.3)) +
  geom_hline(yintercept= 0, linetype = "dashed") +
  facet_grid(~dim,
             labeller = labeller(dim = dim.labs)) +
  geom_pointrange(aes(ymin = lo, ymax = hi),
                  position=position_dodge(width=.3)) +
  scale_color_manual(values=c("black","blue"), 
                     labels = c(expression(hat(beta)),expression(hat(gamma)))) +
  theme(panel.grid.major = element_line(size = 0.5, linetype = 'solid',
                                    colour = "white"),
    panel.grid.minor = element_line(size = 0.25, linetype = 'solid',
                                    colour = "white"),
    legend.position = "bottom") +
  labs(title = "Effect of increase in distance by one unit",
       x = "treatment group",
       y = "Marginal effect (log-odds)" )
ggsave("Figrures/FigureA6.eps",
       width = 7, height = 5, dpi = 400)

# Figure A.7
ggplot(dcpdiff, aes(group, mean, group=estimate, colour=estimate)) +
  geom_point(position=position_dodge(width=.3)) +
  geom_hline(yintercept= 0, linetype = "dashed") +
  facet_grid(~dim,
             labeller = labeller(dim = dim.labs)) +
  geom_pointrange(aes(ymin = lo, ymax = hi),
                  position=position_dodge(width=.3)) +
  scale_color_manual(values=c("red","blue"),
                     labels = c(expression(hat(beta)-hat(gamma)),
                                expression(hat(gamma)))) +
  theme(panel.grid.major = element_line(size = 0.5, linetype = 'solid',
                                        colour = "white"),
        panel.grid.minor = element_line(size = 0.25, linetype = 'solid',
                                        colour = "white"),
        legend.position = "bottom") +
  labs(title = "Effect of increase in distance by one unit",
       x = "treatment group",
       y = "Marginal effect (log-odds)"  )
ggsave("Figures/FigureA7.eps",
       width = 7, height = 5, dpi = 400)
